package data_deepprocessing.algorithm.ssvm.ssvm_util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import data_deepprocessing.algorithm.ssvm.bean.WordMarkBean;
import data_deepprocessing.prepareData.beans.Node2vec_WordBookBean;
import data_deepprocessing.prepareData.db.InitSeedDB;
import data_deepprocessing.prepareData.db.Node2vec_WordBookDB;
import data_deepprocessing.prepareData.db.XianBingShi_New_zhangDB;
import data_deepprocessing.util.FilePathUtil;
import data_deepprocessing.util.FileUtil;

/**
 * @author YUHU YUAN
 * @date 2017-04-10 20:06:30
 * @version 1.0
 *
 * Generates CRFsuite/SSVM cross-validation data sets for the English corpus.
 * The only difference from the word2vec variant is an extra step that reads a
 * node2vec vector file (keyed by numeric word ids) and joins it with the word
 * book from the database.
 */
public class CRFplusCrossValidService2Node2vec2EN {

	private InitSeedDB initSeedDB;
	private XianBingShi_New_zhangDB xianBingShi_New_zhangDB;
	private Node2vec_WordBookDB node2vec_WordBookDB;

	// Dimension of the padding vector emitted for words that have no embedding.
	// NOTE(review): assumes the node2vec vectors are 200-dimensional — confirm
	// against the vector file before changing.
	private static final int VECTOR_DIMENSION = 200;

	// Numeric node id (from the vector file) -> word. Populated once per run.
	private static Map<Integer, String> Num_WordMap = new HashMap<>();

	// Word -> space-separated embedding vector string. Populated once per run.
	private static Map<String, String> Word_VectorMap = new HashMap<>();

	/**
	 * Loads all English word-book entries (num/word pairs) from the database.
	 *
	 * @return the full English word book
	 */
	private List<Node2vec_WordBookBean> doGetWordBookBeans() {
		return node2vec_WordBookDB.selectAllWordBook2EN();
	}

	/**
	 * Builds {@code Word_VectorMap} by joining the word book (num -> word)
	 * with the node2vec vector file (num -> vector).
	 *
	 * @throws IOException if the vector file cannot be read
	 */
	private void doBuildWord_VectorMap() throws IOException {
		for (Node2vec_WordBookBean bean : doGetWordBookBeans()) {
			Num_WordMap.put(bean.getNum(), bean.getWord());
		}

		// try-with-resources: the original never closed this reader (leak).
		try (BufferedReader reader = FileUtil.getReader(FilePathUtil.node2vec_vectorPath2EN)) {
			reader.readLine(); // skip the first line — it is a header, not vector data
			String line;
			while ((line = reader.readLine()) != null) {
				// Each data line is "<num> <v1 v2 ... vn>"; split once on the first space.
				String[] parts = line.split(" ", 2);
				Word_VectorMap.put(Num_WordMap.get(Integer.valueOf(parts[0])), parts[1]);
			}
		}
	}

	/**
	 * Converts every "word\tmark" file under {@code inputPath} into one
	 * SSVM/CRFsuite data set written to {@code outputPath}. Each token becomes
	 * a line "tag\t1:v1\t2:v2...\tn:vn"; a blank line separates files.
	 * Mark-to-tag mapping: O -> 1, B-S -> 2, E-S -> 3 (anything else -> "").
	 * Tokens without an embedding get a constant-1 padding vector.
	 *
	 * @param outputPath destination file for the generated data set
	 * @param inputPath  directory containing the tab-separated mark files
	 * @throws IOException if any file cannot be read or written
	 */
	private void doGenerateCRFDataSet(String outputPath, String inputPath) throws IOException {
		File[] crfFiles = new File(inputPath).listFiles();
		if (crfFiles == null) {
			// inputPath is not a directory (or an I/O error occurred); nothing to do.
			return;
		}
		StringBuilder constructTerm = new StringBuilder();
		// try-with-resources: guarantees the writer is closed (and flushed) on all paths.
		try (BufferedWriter writer = FileUtil.getWriter(outputPath)) {
			for (File file : crfFiles) {
				List<WordMarkBean> wordMarkBeans = new ArrayList<>();
				// try-with-resources: the original reassigned one reader variable per
				// file and only closed the last one — all earlier readers leaked.
				try (BufferedReader reader = FileUtil.getReader(file)) {
					String line;
					while ((line = reader.readLine()) != null) {
						String[] temp = line.split("\t");
						wordMarkBeans.add(new WordMarkBean(temp[0], temp[1]));
					}
				}
				for (WordMarkBean bean : wordMarkBeans) {
					String word = bean.getWord().toLowerCase();
					if (word.isEmpty()) {
						continue; // skip blank tokens
					}
					// Single map lookup (original looked the word up twice).
					String vectorString = Word_VectorMap.get(word);
					String mark = bean.getMark();
					String tag = "";
					if (mark.equals("O")) {
						tag = "1";
					} else if (mark.equals("B-S")) {
						tag = "2";
					} else if (mark.equals("E-S")) {
						tag = "3";
					}
					constructTerm.append(tag).append("\t");
					if (vectorString != null) {
						String[] vector = vectorString.split(" ");
						for (int i = 0; i < vector.length; ++i) {
							// CRFsuite feature format: "<index>:<value>", 1-based index.
							constructTerm.append(i + 1).append(":").append(vector[i]).append("\t");
						}
					} else {
						// No embedding for this word: emit a constant-1 padding vector.
						for (int i = 0; i < VECTOR_DIMENSION; ++i) {
							constructTerm.append(i + 1).append(":").append(1).append("\t");
						}
					}
					writer.write(constructTerm.toString());
					writer.newLine();
					constructTerm.setLength(0);
				}
				writer.newLine(); // blank line marks the end of one file's sequence
			}
		}
	}

	// Root directory of the 10-fold cross-validation data on disk.
	private static String path = "D:\\yyh_yuanyuhu_graduation_experimental\\node2vec_CRF++\\dataset2word2En\\";

	/**
	 * Entry point: builds the word/vector maps, then for each of the 10 folds
	 * converts dataCRF&lt;i&gt;/test and dataCRF&lt;i&gt;/train into
	 * data&lt;i&gt;/test/test.crfsuite.txt and data&lt;i&gt;/train/train.crfsuite.dat.
	 * I/O errors are logged and the remaining folds are still processed.
	 */
	public void doGenerateNode2vec2CRFCrossValidDataSet() {
		try {
			doBuildWord_VectorMap();
		} catch (IOException e1) {
			e1.printStackTrace();
		}

		for (int i = 0; i < 10; ++i) {
			String foldData = path + "data" + i + File.separator;
			String foldCrf = path + "dataCRF" + i + File.separator;

			try {
				doGenerateCRFDataSet(foldData + "test" + File.separator + "test.crfsuite.txt",
						foldCrf + "test");
			} catch (IOException e) {
				e.printStackTrace();
			}
			try {
				doGenerateCRFDataSet(foldData + "train" + File.separator + "train.crfsuite.dat",
						foldCrf + "train");
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}

	public InitSeedDB getInitSeedDB() {
		return initSeedDB;
	}

	public void setInitSeedDB(InitSeedDB initSeedDB) {
		this.initSeedDB = initSeedDB;
	}

	public XianBingShi_New_zhangDB getXianBingShi_New_zhangDB() {
		return xianBingShi_New_zhangDB;
	}

	public void setXianBingShi_New_zhangDB(XianBingShi_New_zhangDB xianBingShi_New_zhangDB) {
		this.xianBingShi_New_zhangDB = xianBingShi_New_zhangDB;
	}

	public Node2vec_WordBookDB getNode2vec_WordBookDB() {
		return node2vec_WordBookDB;
	}

	public void setNode2vec_WordBookDB(Node2vec_WordBookDB node2vec_WordBookDB) {
		this.node2vec_WordBookDB = node2vec_WordBookDB;
	}

}
